In [7]:
import pandas as pd
import numpy
import json
from collections import defaultdict
from matplotlib.pylab import style
style.use('fivethirtyeight')
%pylab inline
# Sentinel used by the upstream extraction for "no value" (Java Integer.MIN_VALUE)
java_min_int = -2147483648
In [8]:
# Load the 2014-10-13 gender snapshot; java_min_int rows become NaN
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
In [3]:
def split_column(q_str):
    """Reduce a pipe-delimited QID cell to its first QID.

    NaN floats are passed through unchanged; any value that is neither a
    NaN float nor a str yields None (matching the original's implicit
    fall-through).
    """
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return q_str  # propagate missing values untouched
        return None  # non-NaN floats produced None before as well
    if type(q_str) is str:
        # the stored format always ends with a trailing '|'
        return q_str.split('|')[0]
In [4]:
# Collapse each multi-valued QID column down to its first QID
for col in ['place_of_birth','gender', 'citizenship','ethnic_group']:
    allrecs[col] = allrecs[col].apply(split_column)
In [5]:
# Sanity-check the split: each QID column should now hold a single QID
allrecs.head(5)
Out[5]:
In [6]:
# Aggregation maps: QID -> country list / culture name (hand- and Turk-built)
pobs_map = json.load(open('helpers/aggregation_maps/pobs_map.json','r'))
# NOTE(review): DataFrame.from_csv is removed in modern pandas; the
# equivalent is pd.read_csv('...', index_col=0)
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')
ethnic_group_map = json.load(open('helpers/aggregation_maps/mechanical_turk/ethnic_groups_map.json','r'))
citizenship_map = json.load(open('helpers/aggregation_maps/mechanical_turk/citizenship_map.json','r'))
def map_pob(qid):
    """Map a place-of-birth QID to a culture name.

    Looks the QID up in pobs_map, takes the first listed country
    (assumption carried over from the original), then reads its
    'culture_name' from country_map.  Returns None for non-str input or
    an empty country list.
    """
    if type(qid) is not str:
        return None
    country_list = pobs_map[qid]
    if not country_list:
        return None
    country = country_list[0]
    return country_map.ix[country]['culture_name']
def map_wrapper(m):
    """Build a lookup function over mapping *m* that yields None for
    unknown keys instead of raising KeyError."""
    def lookup(qid):
        try:
            value = m[qid]
        except KeyError:
            return None
        return value
    return lookup
mismatch = pd.DataFrame()  # filled by determine_culture when two sources disagree
#order is important because it determines the preference we will use
# BUG FIX: materialise the pairs with list().  On Python 3 a bare zip()
# is a one-shot iterator, so it would be exhausted after the first row
# processed by determine_culture and every later row would see no mappers.
col_map_fun = list(zip(['ethnic_group', 'citizenship', 'place_of_birth'],
                       [map_wrapper(ethnic_group_map), map_wrapper(citizenship_map), map_pob]))
def determine_culture(row):
    """Resolve a single culture label for a row.

    Tries each (column, mapper) pair in col_map_fun order; the last
    non-falsy guess wins.  When two non-None guesses disagree, the row
    is recorded in the module-level `mismatch` DataFrame.

    Returns the culture name lowercased, or None when nothing matched.
    """
    global mismatch  # reassigned below when a disagreement is logged
    culture = None
    for col, map_fun in col_map_fun:
        guess = map_fun(row[col])
        if (culture is not None) and (guess is not None):
            if culture != guess:
                # BUG FIX: DataFrame.append returns a *new* frame; the
                # original discarded it, so `mismatch` always stayed empty.
                # (pandas >= 2.0 would need pd.concat instead of append.)
                mismatch = mismatch.append(row, ignore_index=True)
        if guess:
            culture = guess
    return str(culture).lower() if culture else culture  # None passes through
In [173]:
%%timeit -r 1 -n 1
# Benchmark culture resolution on the first 2,500 rows (single run)
allrecs.iloc[0:2500].apply(lambda x: determine_culture(x), axis=1)
In [174]:
%%timeit -r 1 -n 1
# Benchmark culture resolution on the first 25,000 rows (single run)
allrecs.iloc[0:25000].apply(lambda x: determine_culture(x), axis=1)
In [17]:
# Resolve a culture for every record (the lambda is redundant;
# .apply(determine_culture, axis=1) would be equivalent)
allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1)
In [195]:
# Python 2 print statement; shows rows where the sources disagreed on culture
print mismatch
In [176]:
# Cache the enriched frame so later sessions can skip the expensive apply
allrecs.to_json('helpers/world_cultures_shortcut.json')
In [5]:
# Shortcut: reload the cached frame written by the cell above
allrecs = pd.DataFrame.from_dict(json.load(open('helpers/world_cultures_shortcut.json','r')))
In [201]:
# NOTE(review): rank_compare is never defined in this notebook — hidden
# state from another session; this cell fails on a fresh kernel.
import scipy.stats
scipy.stats.spearmanr(rank_compare[['Rank','Rank_wikidata']])
Out[201]:
In [207]:
# Non-parametric comparisons of the two rank columns (same undefined
# rank_compare as above — requires out-of-notebook state)
scipy.stats.mannwhitneyu(rank_compare['Rank'],rank_compare['Rank_wikidata'])
Out[207]:
In [208]:
scipy.stats.ranksums(rank_compare['Rank'],rank_compare['Rank_wikidata'])
Out[208]:
In [205]:
# Python 2 print statement; dumps the comparison table as raw HTML
print rank_compare.to_html()
Quite uncorrelated. That means that either the data is not good, or that the World Economic Forum's methods have little to do with the percentage of women born in those countries as recorded semantically on a historic level. And $\rho$ is high.
In [6]:
# Re-load the country -> culture lookup (same call as the earlier cell)
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')
In [7]:
def map_culture(qid):
    # NOTE(review): byte-for-byte duplicate of map_pob defined earlier in
    # the notebook; despite the generic name it still looks keys up in
    # pobs_map, so it only makes sense for place-of-birth QIDs.
    if not type(qid) is str:
        return None
    else:
        country_list = pobs_map[qid]
        if len(country_list) == 0:
            return None
        else:
            country = country_list[0] #assumption: first listed country wins
            # .ix is removed in modern pandas; .loc is the equivalent here
            culture = country_map.ix[country]['culture_name']
            return culture
In [15]:
# Culture derived from place of birth only (unlike determine_culture,
# which votes across three columns)
allrecs['culture'] = allrecs['place_of_birth'].apply(map_culture)
In [32]:
import math
import pywikibot
# Transforming QIDs into English labels via the Wikidata repository
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()
# memo cache: qid -> English label (or the qid itself when no label exists)
retrieved = dict()
def english_label(qid):
    """Fetch the English Wikidata label for a QID, memoised in `retrieved`.

    Returns None for falsy or NaN input; returns the QID itself (and
    caches that fallback) when the item has no English label.
    NOTE(review): a non-NaN float qid falls through to ItemPage —
    presumably filtered out upstream; confirm.
    """
    if qid:
        if type(qid) is float:
            if math.isnan(qid):
                return None
        # first see if we've already fetched this QID
        try:
            return retrieved[qid]
        except KeyError:
            try:
                page = pywikibot.ItemPage(wikidata, qid)
                data = page.get()  # network call to Wikidata
                lab = data['labels']['en']
                retrieved[qid] = lab
                return lab
            except KeyError:
                # no English label: cache and return the raw QID
                retrieved[qid] = qid
                return qid
    else:
        return None
In [33]:
# Spot-check the label lookup on a gender QID
english_label('Q6581097')
Out[33]:
In [34]:
# Resolve every gender QID to its English label (network-bound; memoised)
allrecs['gender_name'] = allrecs['gender'].apply(english_label)
In [ ]:
# Keep just the two columns needed for the chi-squared test
outdf = allrecs[['gender_name','culture']]
In [ ]:
outdf.to_csv('helpers/Chi_Squared_Test_Data.csv')
How many records have gender, place of birth, and date of birth?
In [9]:
def _has_value(x):
    """True when x is an actual value: not a float NaN and not None."""
    if isinstance(x, float):
        return not math.isnan(x)
    return x is not None

# Per-column presence counts and percentages over the whole frame
has = defaultdict(dict)
for col in allrecs.columns:
    present_mask = allrecs[col].apply(_has_value)
    nonempty = len(allrecs[present_mask])
    has[col]['Items with property'] = nonempty
    has[col]['% of total'] = nonempty / float(len(allrecs))
hasdf = pd.DataFrame.from_dict(has, orient='index')
In [10]:
# Python 2 print; DataFrame.sort here is the old API (sort_values today)
print hasdf.sort('% of total').to_html(justify='right', formatters={'% of total':lambda x: '%.2f' % (x*100),
                                       'Items with property':lambda x: '{0:,}'.format(x)})
In [11]:
# Successive filters: keep records having dob, then gender, then culture
hasdob = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
len(hasdob)
Out[11]:
In [12]:
# QID strings pass the type check; float NaNs are dropped
hasgender = hasdob[hasdob['gender'].apply(lambda x: not math.isnan(x) if type(x) is float else True)]
len(hasgender)
Out[12]:
In [13]:
hascult = hasgender[hasgender['culture'].apply(lambda x: x is not None)]
len(hascult)
Out[13]:
In [14]:
hascult.head()
Out[14]:
In [15]:
# One group per culture label for the per-culture time series below
culture_groups = hascult.groupby('culture')
In [16]:
def make_perc_series(df):
    """Per-birth-year fraction of non-male records.

    For each distinct 'dob' value, divides the count of rows whose
    gender QID is not Q6581097 (male) by the count of rows with a
    non-null gender — exactly what the original explicit loop computed
    (the author's own comment asked for this vectorisation).
    """
    grouped = df.groupby('dob')['gender']
    # .count() ignores nulls in both numerator and denominator, matching
    # the original's .count() semantics per year
    nonmale = grouped.apply(lambda g: g[g != 'Q6581097'].count())
    total = grouped.count()
    # astype(float) forces true division; pd.TimeSeries is removed from
    # modern pandas and the groupby result is already a Series by year
    perc_series = nonmale / total.astype(float)
    return perc_series
# Build {culture: yearly non-male fraction Series}
perc_dict = dict()
for name, group in culture_groups:
    perc_series = make_perc_series(group)
    perc_dict[name] = perc_series
In [34]:
# NOTE(review): out-of-order execution — perc_df is defined in the *next*
# cell (In [35]); this display only works after running that cell first.
perc_df.tail(10)
Out[34]:
In [35]:
perc_df = pd.DataFrame.from_dict(perc_dict)
# NOTE(review): int(200/6.0) == 33, so these sample years step 1800, 1833, ...
years = range(1800,2000,int(200/6.0))
subbd_df = perc_df.ix[years]  # .ix is removed in modern pandas (.loc)
infogram = subbd_df
infogram.to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')
In [37]:
# Two panels: full history (100-yr rolling mean) and modern era (10-yr)
fig, (full, modern) = plt.subplots(1, 2, figsize=(20,6))
end_year = 2000
for start_year, ra_len, ax in zip((-1000, 1800), (100, 10), (full, modern)):
    ra_dict = dict()
    # .items() works on both Python 2 and 3 (.iteritems() is py2-only);
    # pd.rolling_mean matches the pandas version this notebook targets
    for name, series in perc_dict.items():
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per = pd.DataFrame(ra_dict)
    if start_year == 1800:
        year_list = range(1900, end_year, 10)
        # BUG FIX: the original built year_list but then indexed with the
        # stale module-level `years` (1800..2000 step 33) from an earlier cell
        cult_dob_per.ix[year_list].to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')
    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False, zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    # // keeps integer steps for range() on Python 3 as well
    ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 16))
    ax.set_ylim((0, 0.6))
    ax.set_title(u'{}—{}, with {} year Rolling Average'.format(start_year, end_year, ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x)))
# BUG FIX: the original did `full.legend = legend(...)`, which draws the
# legend on the *current* axes (modern) and clobbers full's legend method
full.legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
full.set_xticks(range(-1000, end_year, (end_year + 1000) // 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)
In [27]:
# NOTE(review): near-duplicate of the previous plotting cell (stacked
# instead of side-by-side) — a shared plotting function would remove the copy
fig, (full, modern) = plt.subplots(2,1, figsize=(12,8), sharex=False)
end_year = 2000
for start_year, ra_len, ax in zip((-1000,1800), (100,10), (full, modern)):
    ra_dict = dict()
    for name, series in perc_dict.iteritems():
        # pd.rolling_mean is removed in modern pandas (Series.rolling().mean())
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per = pd.DataFrame(ra_dict)
    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False,zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    # integer division intended (Python 2); use // under Python 3
    ax.set_xticks(range(start_year, end_year,(end_year-start_year) / 16))
    ax.set_ylim((0,0.6))
    ax.set_title(u'{}—{}, with {} year Rolling Average'.format(start_year, end_year,ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
# NOTE(review): pylab's legend() acts on the *current* axes and the
# assignment clobbers full's legend method — full.legend(...) was presumably intended
full.legend = legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
#full.set_xticks(range(-1000, end_year,(end_year+1000) / 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)
In [184]:
# Records with both a date of birth and a resolved culture
dobexists = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
dobcultureexists = dobexists[dobexists['culture'].apply(lambda x: x is not None)]
len(dobcultureexists)
Out[184]:
In [185]:
culture_groups = dobcultureexists[['dob','culture']].groupby(by='culture')
In [186]:
def make_tot_series(df):
    """Number of records per birth year.

    Equivalent to the original per-year loop: for each 'dob' group,
    count the non-null 'culture' entries.  Returns a Series indexed by
    year (pd.TimeSeries no longer exists in pandas; the fixed-up
    vectorisation the original comment asked for).
    """
    return df.groupby('dob')['culture'].count()
# Build {culture: yearly biography-count Series}
tot_dict = dict()
for name, group in culture_groups:
    tot_dict[name] = make_tot_series(group)
In [189]:
# Total biographies per year, several start years and rolling windows
end_year = 2014
for start_year in [1500, 1800]:
    for ra_len in [2, 5, 10]:
        ra_dict = dict()
        # .items() works under Python 2 and 3 (.iteritems() is py2-only)
        for name, series in tot_dict.items():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # BUG FIX: the original bound this Axes to the name `plt`,
        # clobbering the pyplot module injected by %pylab for later cells
        ax = cult_dob.plot(figsize=(20,6), cmap='Set2', linewidth=1.5)
        ax.set_xlim((start_year, end_year))
        # // keeps the range() step an int under Python 3 too
        ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)
In [188]:
# Same totals plot for the ancient eras, log-scaled
for start_year, end_year in zip([-2000, -1000], [1000, 1500]):
    for ra_len in [1, 2, 10]:
        ra_dict = dict()
        # .items() works under Python 2 and 3 (.iteritems() is py2-only)
        for name, series in tot_dict.items():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # BUG FIX: the original bound this Axes to the name `plt`,
        # clobbering the pyplot module injected by %pylab for later cells
        ax = cult_dob.plot(figsize=(20,6), cmap='Set2', linewidth=1.5)
        ax.set_ylim((0, 50))
        ax.set_yscale('log')
        ax.set_xlim((start_year, end_year))
        # // keeps the range() step an int under Python 3 too
        ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)
In [ ]: